16S rRNA amplicon sequencing QC

Author

Laura Symul

Published

December 24, 2025

Note

This document only performs minimal QC at the moment - a lot of the upstream QC has been performed by Joseph Elsherbini and Johnathan Shih in Kwon’s lab and Asavela Kama at CAPRISA (Ngcapu’s lab).

The main purpose of this document is to summarize the counts and relative abundances at the species level, and to harmonize and prepare the data so that it can be integrated with other assays for downstream analyses.

Code
# Load analysis packages: the tidyverse stack, table/plot helpers (gt,
# patchwork), and the Bioconductor containers used below (phyloseq in,
# SummarizedExperiment + tidySummarizedExperiment out).
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(phyloseq)
# Run once if `mia` is not installed:
#BiocManager::install("mia")
library(mia)


# Source project helper functions (e.g. get_data_dir(), and presumably
# colMaxs — TODO confirm) from the local scripts folder and the shared
# VIBRANT utils repository.
# tmp <- fs::dir_map("R scripts mg/", source)
tmp <- fs::dir_map("R scripts/", source)
tmp <- fs::dir_map("../VIBRANT-99-utils/R", source)
rm(tmp) # fs::dir_map returns the (unused) result of each source() call

theme_set(theme_light())

Loading the data

The data is provided as a phyloseq object in an .rds file.

Code
# Load the raw phyloseq object (all sequencing pools merged upstream).
# get_data_dir() comes from the sourced project utils — assumes it points at
# the shared data directory on this machine (TODO confirm).
ps <- 
  readRDS(str_c(get_data_dir(),"04 16S rRNA sequencing/raw_merged_all_pools_ps_20250523.rds"))

ps
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 283061 taxa and 4377 samples ]
sample_data() Sample Data:       [ 4377 samples by 28 sample variables ]
tax_table()   Taxonomy Table:    [ 283061 taxa by 7 taxonomic ranks ]

Sample data:

Code
# Overview of the per-sample metadata stored in the phyloseq object.
# (as.data.frame() first: sample_data carries a sticky phyloseq class that
# as_tibble() cannot handle directly.)
ps@sam_data %>%
  as.data.frame() %>%
  as_tibble() %>%
  glimpse()
Rows: 4,377
Columns: 28
$ sampleID                                  <chr> "2152449_068200035_0000", "2…
$ Sample.ID.Barcode                         <chr> "2152449", "2154015", "21540…
$ Patient.ID                                <chr> "68200035", "68200040", "682…
$ Visit.code                                <chr> "0", "0", "1200", "1000", "1…
$ IDT.adapter.name                          <chr> "IDT10_UDI_Adapter2", "IDT10…
$ i5.index.forward                          <chr> "CGTAGAACAG", "ACCTAAGAGC", …
$ i5.index.reverse                          <chr> "CTGTTCTACG", "GCTCTTAGGT", …
$ i7.index.reverse                          <chr> "ATAAGCGGAG", "CGACCTTAAT", …
$ i7.index                                  <chr> "CTCCGCTTAT", "ATTAAGGTCG", …
$ sample_id                                 <chr> "2152449_068200035_0000", "2…
$ spike_in_Allobacillus_halotolerans_rel    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ spike_in_Imtechella_halotolerans_rel      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ spike_in_Allobacillus_halotolerans_counts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ spike_in_Imtechella_halotolerans_counts   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ total_spike_in_rel                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ total_spike_in_counts                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ CST                                       <chr> "III", "IV-B", "IV-B", "IV-B…
$ subCST                                    <chr> "III-B", "IV-B", "IV-B", "IV…
$ score                                     <dbl> 0.9200522, 0.6396295, 0.7183…
$ Specimen.type                             <chr> "Vaginal Swab", "Vaginal Swa…
$ Well.ID                                   <chr> "B01", "C01", "D01", "E01", …
$ pool                                      <chr> "pool1", "pool1", "pool1", "…
$ Volume.pooled..uL.                        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
$ Researcher                                <chr> "Jiawu", "Jiawu", "Jiawu", "…
$ Description                               <chr> "VIBRANT", "VIBRANT", "VIBRA…
$ Pool10                                    <chr> NA, NA, NA, NA, NA, NA, NA, …
$ Original.MGH.aliquot.Unique.identifier    <dbl> NA, NA, NA, NA, NA, NA, NA, …
$ Site                                      <chr> NA, NA, NA, NA, NA, NA, NA, …

Taxonomic table:

Code
# Overview of the ASV-level taxonomy table (one row per ASV).
ps@tax_table |>
  as.data.frame() |>
  as_tibble() |>
  glimpse()
Rows: 283,061
Columns: 7
$ Domain  <chr> "d_Bacteria", "d_Bacteria", "d_Bacteria", "d_Bacteria", "d_Bac…
$ Phylum  <chr> "p_Bacillota", "p_Bacillota_A", "p_Bacillota", "p_Bacillota", …
$ Class   <chr> "c_Bacilli", "c_Clostridia", "c_Bacilli", "c_Bacilli", "c_Cori…
$ Order   <chr> "o_Lactobacillales", "o_Lachnospirales", "o_Lactobacillales", …
$ Family  <chr> "f_Lactobacillaceae", "f_Lachnospiraceae", "f_Lactobacillaceae…
$ Genus   <chr> "g_Lactobacillus", "g_UBA629", "g_Lactobacillus", "g_Lactobaci…
$ Species <chr> "Lactobacillus_crispatus", "Ca_Lachnocurva_vaginae", "Lactobac…

Data cleaning and harmonization

Dimensions and sparsity

The phyloseq object contains counts for 4377 samples and 283061 ASVs.

Code
# Extract the count matrix (samples in rows, ASVs in columns — see the
# rowSums() normalization below) and compute the overall fraction of zero
# entries, i.e. the sparsity of the table.
counts <- ps@otu_table |> as.matrix()
prop_0 <- mean(counts == 0)
Code
# Per-sample relative abundances: each row (sample) divided by its total reads.
props <- counts/rowSums(counts)
Code
# Strip the extra classes inherited from the phyloseq otu_table so that plain
# matrix methods dispatch correctly downstream.
class(counts) <- "matrix"
class(props) <- "matrix"

This is a very large number of ASVs (probably because dada2 denoises for sequencing errors but not amplification errors) and most of them are not present in most samples as the proportion of 0s in the count table is 99.93%.

Code
# Per-ASV summary statistics across all samples.
# NOTE(review): colMaxs() is not defined in this document — presumably
# matrixStats::colMaxs() made available by one of the sourced util scripts;
# TODO confirm (base alternative: apply(counts, 2, max)).
ASV_stats <- 
  tibble(
    ASV_nb = 1:ncol(counts),           # column index used as ASV identifier
    total_counts = colSums(counts),
    mean_counts = colMeans(counts),
    n_non_zero = colSums(counts > 0),  # prevalence: number of samples with reads
    f_non_zero = n_non_zero / nrow(counts),
    max_counts = colMaxs(counts),
    mean_rel_ab = colMeans(props),
    max_rel_ab = colMaxs(props)
  )
Code
# Four histograms of the ASV statistics, assembled with patchwork.
# Each ggplot ends with a trailing `+`, so the parser keeps reading across the
# blank lines and all four plots form one patchwork expression closed by
# plot_layout() at the end.
ASV_stats |> ggplot() + aes(x = mean_counts) + geom_histogram(bins = 30) +
  scale_x_log10(n.breaks = 10) +
  labs(x = "Mean counts across samples (log scale)", y = "Number of ASVs") +
  
  
ASV_stats |> ggplot() + aes(x = max_counts) + geom_histogram(bins = 30) +
    scale_x_log10(n.breaks = 8) +
  labs(x = "Max number of counts (log scale)", y = "Number of ASVs") +
  
ASV_stats |> ggplot() + aes(x = n_non_zero) + geom_histogram(bins = 30) +
    scale_x_log10(n.breaks = 6) +
  labs(x = "Number of samples with non-zero counts(log scale)", y = "Number of ASVs") +
  
  
# The last panel highlights singleton ASVs (present in exactly one sample).
ASV_stats |> ggplot() + aes(x = max_rel_ab, fill = n_non_zero == 1) + 
  facet_grid(n_non_zero == 1 ~ ., scales = "free") + 
  geom_histogram(bins = 30) +
  scale_x_log10(n.breaks = 8) +
  labs(x = "Max rel. ab. (log scale)", y = "Number of ASVs", fill = "Singleton") +
  
  plot_layout(nrow = 2)

93.1% of ASVs (n = 263543) are present (non-zero counts) in only 1 sample.

Code
# Singletons vs non-singletons: number of ASVs, cumulative reads, and maximum
# relative abundances, rendered as a gt table.
# NOTE(review): "cummulative" in the displayed column name is a typo; it is
# kept as-is here to stay consistent with the rendered table below.
ASV_stats |>
  mutate(`ASV category` = ifelse(n_non_zero == 1, "ASVs with counts in only 1 sample (singletons)", "ASVs with counts in >1 sample")) |>
  group_by(`ASV category`) |> 
  summarize(
    `n ASVs` = n(), 
    `cummulative reads` = sum(total_counts), 
    `highest rel. ab.` = max(max_rel_ab),
    `average highest rel. ab.` = mean(max_rel_ab) |> round(5)
    ) |> 
  mutate(
    `% ASV` = round(`n ASVs` / sum(`n ASVs`) * 100, 2),
    `proportion of total reads` = round(`cummulative reads` / sum(`cummulative reads`) * 100, 2)
    ) |>
  relocate(`% ASV`, .after = `n ASVs`) |>
  relocate(`proportion of total reads`, .after = `cummulative reads`) |>
  gt() 
ASV category n ASVs % ASV cummulative reads proportion of total reads highest rel. ab. average highest rel. ab.
ASVs with counts in >1 sample 19518 6.9 212633574 97.02 1 0.00626
ASVs with counts in only 1 sample (singletons) 263543 93.1 6525875 2.98 1 0.00070

While, collectively, these ASVs (singletons) account for less than 3% of all counts, a few singletons make up a substantial proportion of the sample they are found in (up to 100% of the reads in a few samples).

Code
# Rank ASVs by their maximum relative abundance (descending) and flag singletons.
ASV_stats <- 
  ASV_stats |> 
  arrange(-max_rel_ab) |> 
  mutate(i = row_number(), singleton = (n_non_zero == 1))

# Two bar charts (top 10,000 and top 100 ASVs by max rel. ab.) combined with
# patchwork; each chain ends with a trailing `+` so both plots and the final
# plot_layout() form a single expression.
ASV_stats |> 
  filter(i <= 10000) |> 
  ggplot() +
  aes(x = i, y = max_rel_ab, fill = singleton) +
  geom_col() +
  scale_x_continuous("ASV index (top 10 000 ASVs)") +
  scale_y_continuous("Max relative abundance\nacross samples", labels = scales::label_percent(accuracy = 1)) +
  
ASV_stats |> 
  filter(i <= 100) |> 
  ggplot() +
  aes(x = i, y = max_rel_ab, fill = singleton) +
  geom_col() +
  scale_x_continuous("ASV index (top 100 ASVs)") +
  scale_y_continuous("Max relative abundance\nacross samples", labels = scales::label_percent(accuracy = 1)) +
  
  plot_layout(nrow = 2, guides = "collect")

Code
# Singleton ASVs that dominate (>50% of reads in) the single sample they appear in.
singletons_to_check <- ASV_stats |> filter(singleton, max_rel_ab > 0.5)
Code
# Total relative abundance, per sample, of the flagged dominant singletons.
# `drop = FALSE` keeps the matrix shape even when only one singleton column is
# selected — without it, `props[, <single index>]` collapses to a plain
# vector and rowSums() errors.
samples_to_check <- props[, singletons_to_check$ASV_nb, drop = FALSE] |> rowSums()
# Keep only the samples that actually contain one of these singletons.
samples_to_check <- samples_to_check[samples_to_check > 0]
Code
# Long-format relative abundances for the flagged samples, annotated with the
# taxonomy of each ASV.
props_to_check <- props[names(samples_to_check),]
props_to_check_long <- 
  props_to_check |> 
  as.data.frame() |> 
  rownames_to_column("sample_id") |> 
  pivot_longer(-sample_id, names_to = "ASV_seq", values_to = "rel_ab") |>
  filter(rel_ab > 0) |> 
  dplyr::left_join(
    # ASV_nb is re-derived with row_number() on the tax table; this matches
    # the 1:ncol(counts) numbering used when ASV_stats was built (same order).
    ps@tax_table |> 
      as.data.frame() |> 
      as_tibble(rownames = "ASV_seq") |> 
      mutate(ASV_nb = row_number()),
    by = join_by(ASV_seq)
  ) 
Code
# Heatmap of relative abundances in the flagged samples, split by singleton
# status. The original chunk called theme() twice with conflicting
# axis.text.x settings (90° then 45°); only the last one takes effect, so the
# dead first call is removed here.
props_to_check_long |> 
  mutate(singleton =  ifelse(ASV_nb %in% singletons_to_check$ASV_nb, "Singleton", "Non-singleton")) |>
  ggplot() +
  aes(x = sample_id, y = str_c(Species, "(ASV #", ASV_nb, ")") , fill = rel_ab) +
  facet_grid(singleton ~ ., scales = "free", space = "free") +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue", labels = scales::label_percent(accuracy = 1)) +
  labs(x = "Sample ID", y = "ASV sequence", fill = "Rel. ab.") +
  ggtitle("Relative abundance of singleton ASVs with max rel. ab. > 0.5") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
  )

Many of these “high relative abundance” singletons are in negative controls.

Code
# tmp <- 
#   ps@tax_table |> 
#   as.data.frame() 
# 
# j <-  which(tmp$Species == "Lactobacillus_crispatus")
# 
# tmp <- counts[,j]
# tmp <- tmp |> as.data.frame() 
# 
# tmp_long <- 
#   tmp |> 
#   set_colnames(1:ncol(tmp)) |> 
#   rownames_to_column("sample_id") |> 
#   pivot_longer(-sample_id, names_to = "L_crisp_ASV_nb", values_to = "counts")
# 
# tmp_long |> 
#   ggplot() +
#   aes(x = sample_id, y = L_crisp_ASV_nb, fill = counts |> asinh()) +
#   geom_tile() +
#   scale_fill_gradient(low = "white", high = "blue") 
Code
# Free the large ASV-level intermediates that are no longer needed below.
rm(counts, ASV_stats)

Creation of a SummarizedExperiment object

To facilitate data harmonization and cleaning, we transform the phyloseq object to a SummarizedExperiment object summarized at the species level to reduce the size of the object.

Code
# NOTE(review): phyloseq::tax_glom("Species") errors on this object (hence
# the custom aggregation in make_se_from_ps() below); kept here for the record.
ps_agg <- ps |> tax_glom("Species") 
Note

At a later stage, we could keep either the full ASV counts, or a filtered version of the full ASV counts to only keep non-singleton ASVs and aggregate the singleton at the species level, but at this stage, for most analyses, the species level is sufficient.

Code
#' Convert the raw ASV-level phyloseq object into a species-level
#' SummarizedExperiment.
#'
#' Aggregates ASV counts by species, cleans the taxonomy and sample-metadata
#' names, builds a variable dictionary (stored in metadata()), adds a
#' relative-abundance assay, and orders taxa by decreasing mean relative
#' abundance.
#'
#' @param ps A phyloseq object with otu_table (samples x ASVs), tax_table,
#'   and sample_data.
#' @return A SummarizedExperiment (taxa x samples) with assays `counts` and
#'   `rel_ab`, and metadata holding the name dictionary and the full
#'   ASV-level taxonomy table.
make_se_from_ps <- function(ps) {
  
  # ROWDATA (taxonomy table): one row per ASV, with a human-readable ASV_id
  # built from the species name and a zero-padded ASV number.
  # NOTE(review): the group_by(Species) has no effect on the ASV_id mutate
  # (row_number() is computed before grouping) — presumably a leftover.
  asv_tax_table <- 
    ps@tax_table |> 
    as.data.frame() |>
    rownames_to_column("ASV_sequence") |> 
    mutate(ASV_nb = row_number()) |>
    group_by(Species) |> 
    mutate(ASV_id = str_c(Species, " (ASV", ASV_nb |> str_pad(width = 6, pad = "0"),")")) |> 
    ungroup() |> 
    mutate(rownames = ASV_id) |> 
    column_to_rownames("rownames") |> 
    relocate(ASV_sequence, .after = ASV_id)  |> 
    janitor::clean_names()
  
  # Sanity check (disabled): each species maps to a unique full lineage.
  # (asv_tax_table$species |> unique() |> length()) == (asv_tax_table |> dplyr::count(domain, phylum, class, order, family, genus, species) |> nrow())
  
  # Collapse to one row per species, keeping the number of ASVs and their ids.
  se_rowdata <- 
    asv_tax_table |> 
    group_by(domain, phylum, class, order, family, genus, species) |> 
    summarize(
      n_asv = n(),
      asv_ids = str_c(asv_id, collapse = ", "),
      .groups = "drop"
    )
  
  # Clean the taxonomic labels: strip the rank prefixes (d_, p_, c_, o_, f_,
  # g_) and rewrite placeholder names like "X Domain" as "X (Domain)" so that
  # the last informative rank is visible in the species label.
  se_rowdata <- 
    se_rowdata |> 
    mutate(
      taxon_id = species,
      domain = domain |> str_remove("^d_"),
      phylum = phylum |> str_remove("^[dp]_") |> 
        str_replace(" Domain", " (Domain)"),
      class = class |> str_remove("^[dpc]_") |> 
        str_replace(" Domain", " (Domain)"),
      order = order |> str_remove("^[dpco]_") |> 
        str_replace(" Domain", " (Domain)") |> 
        str_replace(" Class", " (Class)"),
      family = family |> str_remove("^[dpcof]_") |> 
        str_replace(" Domain", " (Domain)") |> 
        str_replace(" Class", " (Class)") |> 
        str_replace(" Order", " (Order)"),
      genus = genus |> str_remove("^[dpcofg]_") |> 
        str_replace(" Domain", " (Domain)") |> 
        str_replace(" Class", " (Class)") |> 
        str_replace(" Order", " (Order)") |>
        str_replace(" Family", " (Family)"),
      species = species |> str_remove("^[dpcofg]_") |>
        str_replace(" Domain", " (Domain)") |> 
        str_replace(" Class", " (Class)") |> 
        str_replace(" Order", " (Order)") |>
        str_replace(" Family", " (Family)") |>
        str_replace(" Genus", " (Genus)") |> 
        str_replace_all("_", " "),
      # Deepest rank actually identified for this taxon ("Species" if none of
      # the "(Rank)" placeholders appears in the cleaned species label).
      last_available_taxonomic_level = 
        species |> 
        str_extract("\\(Domain\\)|\\(Class\\)|\\(Order\\)|\\(Family\\)|\\(Genus\\)") |> 
        str_remove("\\(") |> str_remove("\\)") |> 
        str_replace_na("Species"),
      taxon_label = species
    ) |> 
    select(taxon_id, taxon_label, everything())
  
  # rowData needs rownames matching the assay rownames (taxon_id).
  se_rowdata <- 
    se_rowdata |> 
    as.data.frame() |> 
    mutate(rownames = taxon_id) |> 
    column_to_rownames("rownames") 
  
  # Dictionary mapping cleaned rowData column names back to the original
  # tax_table columns, with a short description of each derived column.
  rowData_dictionary <- 
    tibble(name = colnames(se_rowdata)) |> 
    dplyr::left_join(
      tibble(original_name = ps@tax_table |> colnames()) |> 
        mutate(name = original_name |> str_to_lower()),
      by = join_by(name)
    ) |> 
    mutate(
      description = 
      case_when(
        name == "taxon_id" ~ "Unique taxon identifier",
        name == "taxon_label" ~ "Taxon label",
        name == "domain" ~ "Domain",
        name == "phylum" ~ "Phylum",
        name == "class" ~ "Class",
        name == "order" ~ "Order",
        name == "family" ~ "Family",
        name == "genus" ~ "Genus",
        name == "species" ~ "Species",
        name == "n_asv" ~ "Number of ASVs for this taxon",
        name == "asv_ids" ~ "Identifiers of the ASVs for this taxon",
        name == "last_available_taxonomic_level" ~ "Last available taxonomic level",
        TRUE ~ "???"
      )
    )
  
  
  # COLDATA (sample metadata)
  se_coldata <-
    ps@sam_data |> 
    as.data.frame() |> 
    as_tibble() |>  # weird trick because of the sticky `phyloseq` attribute
    as.data.frame() |> 
    set_rownames(rownames(ps@sam_data))
  
  # Record original names before cleaning; bind_cols below relies on
  # janitor::clean_names() preserving column order.
  colData_dictionary <- 
    tibble(
      original_name = se_coldata |> colnames()
    )
  
  se_coldata <- se_coldata |> janitor::clean_names()
  # "_ps" suffix marks the un-harmonized visit code from the phyloseq object.
  se_coldata <- se_coldata |> dplyr::rename(visit_code_ps = visit_code)
  
  colData_dictionary <- colData_dictionary |> bind_cols(name = colnames(se_coldata))
  
  se_coldata <- se_coldata |> mutate(sample_id_16s = rownames(se_coldata)) 
   

  # ASSAY: transpose to taxa x samples, then sum ASV counts within species.
  raw_counts <- ps@otu_table |> as.matrix()
  class(raw_counts) <- "matrix"
  raw_counts <- raw_counts |> t()
  rownames(raw_counts) <- asv_tax_table$asv_id
  
  tmp <- 
    raw_counts |> 
    as.data.frame() |>
    mutate(species = asv_tax_table$species) 
  
  counts_assay <- 
    tmp |> 
    group_by(species) |> 
    summarize(across(everything(), sum)) |> 
    column_to_rownames("species")
  
  # Align assay rows with the (uncleaned-id) rowData order.
  counts_assay <- counts_assay[se_rowdata$taxon_id,]
  
  # SummarizedExperiment
  
  # rel_ab: per-sample relative abundances (columns sum to 1).
  # NOTE(review): a sample with zero total counts (if any) would yield NaN.
  se <- SummarizedExperiment(
    assays = list(
      counts = counts_assay,
      rel_ab = t(t(counts_assay) / colSums(counts_assay))
    ),
    rowData = se_rowdata,
    colData = se_coldata,
    metadata = list(
      name = "VIBRANT 16S rRNA amplicon sequencing",
      date = today(),
      dictionary = 
        bind_rows(
          colData_dictionary |> mutate(table = "colData"),
          rowData_dictionary |> mutate(table = "rowData")
          ),
      asv_tax_table = asv_tax_table
    )
  )
  
  
  # Reorder taxa by decreasing mean relative abundance across samples.
  taxa_order <- 
    se |> as_tibble() |> 
    group_by(.feature) |> summarize(mean_rel_ab = mean(rel_ab)) |> 
    arrange(-mean_rel_ab) |> pull(.feature)
  
  se <- se[taxa_order, ]
  
  
  se
}
Code
# Build the species-level SummarizedExperiment and keep an untouched copy so
# the later (in-place) cleaning steps can be compared against the original.
se_16S <- ps |> make_se_from_ps()
se_16S_backup <- se_16S
Code
# Print the tidySummarizedExperiment overview of the new object.
se_16S
# A SummarizedExperiment-tibble abstraction: 3,576,009 × 45
# Features=817 | Samples=4377 | Assays=counts, rel_ab
   .feature        .sample counts  rel_ab sample_id sample_id_barcode patient_id
   <chr>           <chr>    <dbl>   <dbl> <chr>     <chr>             <chr>     
 1 Lactobacillus_… 215244…  65218 7.21e-1 2152449_… 2152449           68200035  
 2 Lactobacillus_… 215244…   2569 2.84e-2 2152449_… 2152449           68200035  
 3 Gardnerella_va… 215244…  12469 1.38e-1 2152449_… 2152449           68200035  
 4 Ca_Lachnocurva… 215244…     25 2.76e-4 2152449_… 2152449           68200035  
 5 Sneathia_vagin… 215244…      0 0       2152449_… 2152449           68200035  
 6 Fannyhessea_va… 215244…    871 9.63e-3 2152449_… 2152449           68200035  
 7 Sneathia_sangu… 215244…      6 6.63e-5 2152449_… 2152449           68200035  
 8 Prevotella_biv… 215244…    293 3.24e-3 2152449_… 2152449           68200035  
 9 Prevotella_amn… 215244…      0 0       2152449_… 2152449           68200035  
10 f_Dethiosulfov… 215244…      0 0       2152449_… 2152449           68200035  
# ℹ 40 more rows
# ℹ 38 more variables: visit_code_ps <chr>, idt_adapter_name <chr>,
#   i5_index_forward <chr>, i5_index_reverse <chr>, i7_index_reverse <chr>,
#   i7_index <chr>, sample_id_2 <chr>,
#   spike_in_allobacillus_halotolerans_rel <dbl>,
#   spike_in_imtechella_halotolerans_rel <dbl>,
#   spike_in_allobacillus_halotolerans_counts <dbl>, …

We renamed the columns in the sample data:

Code
# Show the name dictionary (original vs cleaned column names) stored in metadata.
se_16S@metadata$dictionary |> gt()
original_name name table description
sampleID sample_id colData NA
Sample.ID.Barcode sample_id_barcode colData NA
Patient.ID patient_id colData NA
Visit.code visit_code_ps colData NA
IDT.adapter.name idt_adapter_name colData NA
i5.index.forward i5_index_forward colData NA
i5.index.reverse i5_index_reverse colData NA
i7.index.reverse i7_index_reverse colData NA
i7.index i7_index colData NA
sample_id sample_id_2 colData NA
spike_in_Allobacillus_halotolerans_rel spike_in_allobacillus_halotolerans_rel colData NA
spike_in_Imtechella_halotolerans_rel spike_in_imtechella_halotolerans_rel colData NA
spike_in_Allobacillus_halotolerans_counts spike_in_allobacillus_halotolerans_counts colData NA
spike_in_Imtechella_halotolerans_counts spike_in_imtechella_halotolerans_counts colData NA
total_spike_in_rel total_spike_in_rel colData NA
total_spike_in_counts total_spike_in_counts colData NA
CST cst colData NA
subCST sub_cst colData NA
score score colData NA
Specimen.type specimen_type colData NA
Well.ID well_id colData NA
pool pool colData NA
Volume.pooled..uL. volume_pooled_u_l colData NA
Researcher researcher colData NA
Description description colData NA
Pool10 pool10 colData NA
Original.MGH.aliquot.Unique.identifier original_mgh_aliquot_unique_identifier colData NA
Site site colData NA
NA taxon_id rowData Unique taxon identifier
NA taxon_label rowData Taxon label
Domain domain rowData Domain
Phylum phylum rowData Phylum
Class class rowData Class
Order order rowData Order
Family family rowData Family
Genus genus rowData Genus
Species species rowData Species
NA n_asv rowData Number of ASVs for this taxon
NA asv_ids rowData Identifiers of the ASVs for this taxon
NA last_available_taxonomic_level rowData Last available taxonomic level

Harmonization of sample and participant IDs

We have data for the following samples:

Code
# Overview of available samples per participant and visit code, faceted by site.
se_16S@colData |> 
  as.data.frame() |> 
  ggplot() +
  aes(x = patient_id, y = visit_code_ps) +
  geom_point() +
  facet_grid(. ~ site, scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

We see that we’ll have to harmonize the patient_id and visit_code columns. From harmonized participant ID, we’ll be able to impute the site location.

Code
# Harmonized participant ID (pid): strip "_" and "-", then pad "68..."-style
# IDs to the 9-character "068..." convention.
se_16S@colData <- 
  se_16S@colData |> 
  as.data.frame() |> 
  mutate(
    pid = 
      patient_id |> 
      str_remove_all("_") |> 
      str_remove_all("-") |> 
      str_replace_all("^68","068")  # anchored pattern, so replaces at most once
  ) |> 
  DataFrame()
Code
# List the pids that do not follow the 9-character standard — expected to be
# controls and samples from other studies (see the table and note below).
se_16S@colData |> 
  as.data.frame() |> 
  mutate(pid_length = str_length(pid)) |> 
  dplyr::filter(pid_length != 9) |> 
  dplyr::count(patient_id, pid) |> 
  gt(
    caption = "Samples with non-standard participant IDs"
  )
Samples with non-standard participant IDs
patient_id pid n
98120009 98120009 7
Amplification NEG control Amplification NEG control 27
Amplification POS control Amplification POS control 42
Mock 1 Mock 1 44
Mock 2 Mock 2 46
Nuclease-free water Nucleasefree water 5
Nuclease_free water Nucleasefree water 29
Unused swab_C2 Unused swabC2 34
Note

As in the metagenomics data, there were 7 samples from a participant from another study (with an ID starting with (0)98).

The rest looks legit.

We next clean the visit_code_ps

Code
 # Inspect the distinct visit codes before recoding them below.
 se_16S@colData |> 
  as.data.frame() |> 
  dplyr::count(visit_code_ps)
Code
# Harmonized visit_code: recode legacy visit labels (V7/V8/V9) to their
# numeric codes, then zero-pad all codes to 4 characters (e.g. "0" -> "0000").
se_16S@colData <- 
  se_16S@colData |> 
  as.data.frame() |> 
  mutate(
    visit_code = 
      visit_code_ps |> 
      str_replace("V7", "1700") |> 
      str_replace("V8", "1900") |>
      str_replace("V9", "2120") |> 
      str_pad(width = 4, pad = "0") 
  ) |> 
  DataFrame()

We also create new columns for the sample_type and control_type

Code
# Classify samples into clinical samples vs the various control types.
# NOTE(review): the sample_type case_when() has no TRUE fallback, so any
# sample matching none of the patterns would get NA — confirm all control
# naming variants are covered.
# The unanchored str_detect(pid, "98") is evaluated after the "^068" clause,
# so clinical "068..." pids are already matched by then.
se_16S@colData <- 
  se_16S@colData |> 
  as.data.frame() |> 
  mutate(
    sample_type = 
      case_when(
        str_detect(pid, "^068") ~ "Clinical sample",
        str_detect(pid, "98") ~ "Clinical sample (other study)",
        str_detect(pid, "Amplification NEG control") | str_detect(sample_id, "Amplification_NEG_control") ~ "Amplification negative control",
        str_detect(pid, "Amplification POS control") | str_detect(sample_id, "Amplification_POS_control") ~ "Amplification positive control",
        str_detect(pid, "Mock") | str_detect(sample_id, "Mock") ~ "Positive control",
        str_detect(pid, "water") | str_detect(pid, "Unused") | str_detect(sample_id, "water") | str_detect(sample_id, "Unused")  ~ "Negative control",
      ),
    # control_type refines sample_type; clinical samples get an empty string.
    control_type = 
      case_when(
        str_detect(sample_type, "Clinical") ~ "",
        str_detect(sample_id, "Mock1") ~ "Mock 1",
        str_detect(pid, "Mock") ~ pid,
        str_detect(pid, "water") | str_detect(sample_id, "water") ~ "Nuclease-free water",
        str_detect(pid, "Unused") | str_detect(sample_id, "Unused")  ~ "Unused swab + C2",
        TRUE ~ sample_type
      )
  ) |> 
  DataFrame()
Code
# Tally of samples per (sample_type, control_type).
se_16S@colData |> 
  as.data.frame() |> 
  dplyr::count(sample_type, control_type) |> 
  gt()
sample_type control_type n
Amplification negative control Amplification negative control 29
Amplification positive control Amplification positive control 47
Clinical sample 4133
Clinical sample (other study) 7
Negative control Nuclease-free water 35
Negative control Unused swab + C2 35
Positive control Mock 1 45
Positive control Mock 2 46
Code
# pid is only meaningful for clinical samples; blank it out for all controls.
se_16S$pid <- ifelse(!(se_16S$sample_type %in% c("Clinical sample", "Clinical sample (other study)")), NA_character_, se_16S$pid)

We also create two columns (ext_lib_plate_type and ext_lib_plate_nb) that contain information about the extraction library plate (from Sinaye) so that we’ll be able to compare the controls between 16S and metagenomics (the barcodes do not seem to match those provided by Michael - there, in the metagenomics data, the biological control barcodes start with “EQ”)

Note

Does that “EQ”-starting barcodes sound familiar or not at all?

Code
# Parse extraction/library plate type and number from the control sample IDs
# so that controls can later be matched with the metagenomics assay.
# case_when() is evaluated in order: "^plate" only matches ids starting with
# "plate"; ids like "Mock1_plate1" fall through to the "_plate1" clause.
tmp <- 
  se_16S@colData |> 
  as_tibble() |> 
  dplyr::filter(sample_type %in% c("Positive control", "Negative control")) |> 
  select(sample_id, sample_type, control_type) |> 
  mutate(
    ext_lib_plate_type = 
      case_when(
        str_detect(sample_id, "^dailyplate") ~ "CAP daily swabs",
        str_detect(sample_id, "^mghdailyplate") ~ "MGH daily swabs",
        str_detect(sample_id, "^plate") ~ "Weekly swabs",
        str_detect(sample_id, "_plate1") ~ "Weekly swabs",
        TRUE ~ "???"
      ),
    # Remove "Mock1_" first so parse_number() picks up the plate number,
    # not the "1" from "Mock1".
    ext_lib_plate_nb = sample_id |> str_remove("Mock1_") |> parse_number()
  ) 
Code
# Visual check: which controls are present on which extraction plates.
tmp |> 
  ggplot() +
  aes(x = ext_lib_plate_nb |> factor(), y = control_type, col = control_type) +
  facet_grid(sample_type ~ ext_lib_plate_type, scales = "free", space = "free") +
  geom_point() +
  xlab("Extraction library plate number") +
  ylab("") +
  guides(col = "none") +
  theme(strip.text.y = element_text(angle = 0))

Code
# Attach the plate columns back onto the full colData, in colData row order.
# `relationship = "many-to-one"` guards against an accidental join fan-out
# (duplicate sample_id in `tmp`), which would silently break the positional
# column assignments below.
tmp <- 
  se_16S@colData |> 
  as.data.frame() |> 
  select(sample_id) |>
  dplyr::left_join(
    tmp |> select(sample_id, starts_with("ext_lib_plate")), 
    by = "sample_id",
    relationship = "many-to-one"
  ) 

se_16S@colData$ext_lib_plate_type <- tmp$ext_lib_plate_type
se_16S@colData$ext_lib_plate_nb <- tmp$ext_lib_plate_nb

Number of observations per pid and visit_code

After harmonizing the pid and visit_code, we notice that a number of participants have two samples per visit.

All of these “replicates” are from “pool10” which was composed of samples that had to be re-sequenced.

Code
# Count clinical samples per participant x visit; pairs with 2 samples were
# re-sequenced in pool10 (see below).
# (Note: "occurence" in the displayed column name is a typo, kept as-is to
# match the rendered table.)
replicates <- 
  se_16S@colData |> 
  as.data.frame() |> 
  as_tibble() |> 
  dplyr::filter(sample_type == "Clinical sample")

replicates |> 
  dplyr::count(pid, visit_code, name = "n samples per participant x visit") |> 
  dplyr::count(`n samples per participant x visit`, name = "occurence") |> 
  gt()
n samples per participant x visit occurence
1 3797
2 168
Code
# Same count using the un-harmonized ids (for comparison, disabled):
# se_16S@colData |> 
#   as.data.frame() |> 
#   as_tibble() |> 
#   dplyr::filter(sample_type == "Clinical sample") |> 
#   dplyr::count(patient_id, visit_code_ps) |> 
#   dplyr::count(n)


# Number each sample within its (pid, visit_code) pair and keep only the pairs
# with replicates.
# NOTE(review): rep_nb relies on the stable sort leaving pool10 rows after the
# original-pool rows (confirmed by the table below where rep_nb 2 is always
# pool10); arranging by pool explicitly would make this more robust.
replicates <- 
  replicates |>
  group_by(pid, visit_code) |> 
  arrange(visit_code) |>  
  mutate(n = n(), rep_nb = row_number()) |> 
  ungroup() |> 
  dplyr::filter(n > 1) |> 
  select(sample_id, patient_id, pid, visit_code_ps, visit_code, rep_nb, pool) |> 
  arrange(pid, visit_code_ps) 

replicates |> 
  dplyr::count(rep_nb, pool) |> 
  gt()
rep_nb pool n
1 pool1 29
1 pool2 26
1 pool3 2
1 pool5 73
1 pool6 19
1 pool7 3
1 pool8 14
1 pool9 2
2 pool10 168

We’ll do a comparison of the composition of these samples below.

Code
# Flag samples involved in re-sequencing: "Yes" = original sequencing of a
# re-sequenced sample (any pool but pool10), "2nd sequencing" = the pool10
# replicate, "No" = everything else.
se_16S@colData$resequenced <- "No"
se_16S@colData$resequenced[se_16S@colData$sample_id %in% replicates$sample_id[replicates$pool != "pool10"]] <- "Yes"
se_16S@colData$resequenced[se_16S@colData$sample_id %in% replicates$sample_id[replicates$pool == "pool10"]] <- "2nd sequencing"

se_16S@colData$resequenced <- se_16S@colData$resequenced |> factor(levels = c("No", "Yes", "2nd sequencing"))

Matching pid and visit_code with the CRF data

Note

Below, I load a list of all visits for all participants as found in the CRF data - this is NOT based on CRF-based specimen collection data (CRF35 or CRF47). That comparison will be done after the MultiAssayExperiment object is built.

TODO: change the path and file this loads after finalization of clinical data cleaning.

Code
# Loads the CRF visit summary (object `v`).
# TODO (also noted above): replace this absolute, user-specific path once the
# clinical data cleaning is finalized.
load("/Users/laurasymul/OneDrive - UCL/Academia/Research/VIBRANT clinical data UCLouvain/Data processed/2025-05-14/visits_summary.RData", verbose = TRUE)
Loading objects:
  v
Code
# Align the CRF table with the 16S conventions: suffix its visit_code and
# prefix the pids with "068".
v <- v |> dplyr::rename(visit_code_crf = visit_code) |> mutate(pid = str_c("068", pid))
Code
# Zero-pad the CRF visit codes to match the 4-character 16S visit codes.
v <- 
  v |> 
  mutate(visit_code = visit_code_crf |> as.character() |> str_pad(width = 4, pad = "0")) 
Code
# Overview of CRF visits per participant, by location and randomization status.
v |> 
  ggplot() +
  aes(x = pid, y = visit_code, col = randomized) +
  geom_point() +
  facet_grid(. ~ location + randomized, scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) + 
  ggtitle("CRF data")
Code
# Full join of 16S clinical samples and CRF visits on (pid, visit_code).
# has_16S_data / has_CRF_data record which side each entry came from;
# participant-level CRF info (location, eligibility, randomization) is then
# attached by pid.
matched <- 
  dplyr::full_join(
    se_16S@colData |> 
      as.data.frame() |> 
      dplyr::filter(sample_type == "Clinical sample") |> 
      as_tibble() |> 
      select(pid, visit_code, patient_id, visit_code_ps, sample_id, site) |> 
      mutate(has_16S_data = TRUE),
    v |> select(pid, visit_code) |>  mutate(has_CRF_data = TRUE),
    by = join_by(pid, visit_code)
  ) |> 
  mutate(
    # Rows present on only one side of the full join get NA flags -> FALSE.
    has_16S_data = has_16S_data |> replace_na(FALSE),
    has_CRF_data = has_CRF_data |> replace_na(FALSE)
  ) |> 
  dplyr::left_join(
    v |> select(pid, location, met_eligibility, randomized) |> distinct() |> mutate(pid_in_CRFs = TRUE),
    by = join_by(pid)
  ) |> 
  mutate(
    pid_in_CRFs = pid_in_CRFs |> replace_na(FALSE)
  )
Code
# Cross-tabulation of data availability (16S vs CRF) per entry.
matched |> 
  dplyr::count(has_16S_data, has_CRF_data, pid_in_CRFs, randomized, name = "n entries") |> 
  gt()
has_16S_data has_CRF_data pid_in_CRFs randomized n entries
FALSE TRUE TRUE FALSE 414
FALSE TRUE TRUE TRUE 188
TRUE FALSE FALSE NA 9
TRUE FALSE TRUE FALSE 3
TRUE FALSE TRUE TRUE 110
TRUE TRUE TRUE FALSE 75
TRUE TRUE TRUE TRUE 3936
Code
# Visual check of the 16S/CRF match per participant and visit.
matched |> 
  ggplot() +
  aes(x = pid, y = visit_code, col = has_16S_data, shape = has_CRF_data) +
  geom_point() +
  facet_grid(. ~ location + ifelse(randomized,"Randomized", "Not randomized"), scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) + 
  scale_shape_manual(values = c(1, 16)) +
  ggtitle("matched data")

Let’s focus on

  • “Clinical samples” in the 16S data that we do not have in the CRFs
  • Randomized participants with non-matching visits in the 16S data.

Participant ID not in the CRFs

Code
# 16S samples whose pid has no CRF entry (location is NA after the join).
matched |> 
  dplyr::filter(is.na(location)) |> 
  dplyr::count(pid, patient_id) |> 
  gt()
pid patient_id n
0681000xx 068_10_00xx 1
068200000 068200000 3
068200000 68200000 5

These are likely “test” participants.

We change their sample_type to “Test sample”.

Code
# Re-label the likely "test" participants so they are excluded from clinical analyses.
se_16S$sample_type <- ifelse(se_16S$pid %in% c("0681000xx", "068200000"), "Test sample", se_16S$sample_type) 

Randomized participants with non-matching visits in the 16S data

Code
# Same matching plot, restricted to randomized participants.
matched |> 
  dplyr::filter(randomized) |> 
   ggplot() +
  aes(x = pid, y = visit_code, col = has_16S_data, shape = has_CRF_data) +
  geom_point(size = 1.5) +
  facet_grid(. ~ location + ifelse(randomized,"Randomized", "Not randomized"), scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) + 
  scale_shape_manual(values = c(1, 16)) +
  ggtitle("matched data")

The match is reasonably good. Some observations around the mismatches:

  • We see a few examples of likely mis-labelled visits (e.g., around the screening visits)

  • and a few “missing” 16S at visits where swabs are not expected (e.g. 2121 or 1511).

  • One SA participant has a whole week of missed swabs, but another week without crfs but with swabs.

  • A few US participants were supposed to collect swabs but likely did not (to check with the “daily specimen collection CRF” (CRF47) after MAE creation)

Overall, it looks reasonable.

Fixing visit_code and uids

We fix a few visit_code based on the CRF data, trial information, or based on information provided by the different labs.

Code
# Interactive exploration of the mismatches fixed in the next chunk.
# NOTE(review): View() only works in an interactive session and may fail or
# do nothing during a non-interactive render — consider print() instead.
se_16S@colData |> 
  as_tibble() |> 
  dplyr::filter(visit_code == "8999") |> 
  select(sample_id, patient_id, pid, visit_code_ps, visit_code)


se_16S@colData |> 
  as_tibble() |> 
  dplyr::filter(visit_code == "0011") |> 
  select(sample_id, patient_id, pid, visit_code_ps, visit_code)

se_16S@colData |> 
  as_tibble() |> 
  dplyr::filter(pid == "068200023") |> 
  select(sample_id, patient_id, pid, visit_code_ps, visit_code)


matched |> 
  dplyr::filter(randomized) |> 
  dplyr::filter(has_16S_data, !has_CRF_data) |> 
  arrange(pid, visit_code) |> View()
Code
# Apply the manual visit_code fixes justified by the exploration above.
se_16S@colData <- 
  se_16S@colData |> 
  as.data.frame() |> 
  mutate(
    visit_code = 
      case_when(
        # one MGH participant had their visit code set to the termination visit code. 
        # However, for the other assays, it was the actual visit code that was used instead.
        # for that participant (068100050), their last visit was 1700
        (pid == "068100050") & (visit_code == "8999") ~ "1700", 
        # two CAP participants had "0011" as their visit code. 
        # these should be replaced by "0010"
        (visit_code == "0011") ~ "0010",
        TRUE ~ visit_code
      )
  ) |> 
  DataFrame()

Exploratory & QC analyses

Distribution of counts and relative abundances per taxon

Code
# Per-taxon prevalence and relative-abundance summary across all samples.
# All relative abundances are reported in percent, rounded to 2 decimals.
pct <- function(x) round(x * 100, 2)

taxon_summary <- 
  se_16S |> 
  as_tibble() |> 
  group_by(.feature) |> 
  summarize(
    n_samples = n(),
    n_non_zero_samples = sum(counts > 0, na.rm = TRUE),
    mean_rel_ab = mean(rel_ab, na.rm = TRUE) |> pct(),
    max_rel_ab = max(rel_ab, na.rm = TRUE) |> pct(),
    sd_rel_ab = sd(rel_ab, na.rm = TRUE) |> pct(),
    pc_10 = quantile(rel_ab, 0.1, na.rm = TRUE) |> pct(),
    pc_50 = quantile(rel_ab, 0.5, na.rm = TRUE) |> pct(),
    pc_90 = quantile(rel_ab, 0.9, na.rm = TRUE) |> pct()
  ) 
Code
# Display the 20 most prevalent taxa (by number of non-zero samples).
taxon_summary |>
  arrange(desc(n_non_zero_samples)) |>
  slice_head(n = 20) |>
  select(
    .feature, n_samples, n_non_zero_samples,
    mean_rel_ab, max_rel_ab, pc_10, pc_50, pc_90
  ) |>
  gt() |>
  cols_label(
    .feature = "taxon",
    n_samples = "N samples",
    n_non_zero_samples = "n samples with non-zero counts",
    mean_rel_ab = "Average relative abundance across samples (%)",
    max_rel_ab = "Maximum relative abundance across samples (%)",
    pc_10 = "10th percentile relative abundance across samples (%)",
    pc_50 = "Median relative abundance across samples (%)",
    pc_90 = "90th percentile relative abundance across samples (%)"
  )
taxon N samples n samples with non-zero counts Average relative abundance across samples (%) Maximum relative abundance across samples (%) 10th percentile relative abundance across samples (%) Median relative abundance across samples (%) 90th percentile relative abundance across samples (%)
d_Bacteria Domain 4377 4000 1.74 100.00 0.01 0.49 4.08
Gardnerella_vaginalis 4377 3944 9.73 100.00 0.00 5.54 25.89
Lactobacillus_iners 4377 3856 24.97 100.00 0.00 9.20 79.60
g_Lactobacillus Genus 4377 3698 1.40 17.34 0.00 0.54 4.17
g_Nanoperiomorbus Genus 4377 3495 0.64 100.00 0.00 0.09 1.42
f_Dethiosulfovibrionaceae Family 4377 3206 1.96 100.00 0.00 0.10 5.15
Fannyhessea_vaginae 4377 3191 3.60 73.26 0.00 0.39 11.19
g_Dialister Genus 4377 3188 0.32 8.95 0.00 0.12 0.85
g_Prevotella Genus 4377 3105 1.66 62.50 0.00 0.07 3.74
f_Bifidobacteriaceae Family 4377 3047 0.14 7.58 0.00 0.03 0.36
g_Sneathia Genus 4377 2897 1.01 50.07 0.00 0.08 3.52
Sneathia_vaginalis 4377 2849 4.94 62.92 0.00 0.10 18.80
Prevotella_timonensis 4377 2808 1.33 53.44 0.00 0.06 3.51
Dialister_micraerophilus 4377 2745 0.11 18.03 0.00 0.03 0.27
Aerococcus_christensenii 4377 2559 1.16 54.00 0.00 0.02 1.28
Sneathia_sanguinegens 4377 2534 2.62 51.69 0.00 0.04 9.02
Berryella_sp001552935 4377 2521 0.82 20.51 0.00 0.03 2.76
g_Peptoniphilug_A Genus 4377 2440 0.08 1.62 0.00 0.01 0.26
Finegoldia_magna_H 4377 2431 0.47 29.27 0.00 0.01 0.80
Lactobacillus_crispatus 4377 2411 17.41 100.00 0.00 0.01 79.87

Total number of counts per sample.

Code
se_16S@colData$total_counts <- colSums(assay(se_16S, "counts"))
Code
# Histogram of per-sample library sizes, split by sample type and
# colored by control type.
se_16S@colData |>
  as.data.frame() |>
  ggplot(aes(x = total_counts, fill = control_type)) +
  geom_histogram(bins = 30) +
  scale_x_log10(n.breaks = 8) +
  facet_grid(sample_type ~ ., scales = "free") +
  labs(x = "Total counts (log scale)", y = "Nb of samples") +
  theme(strip.text.y = element_text(angle = 0))

Code
# Library sizes of clinical samples, comparing re-sequenced samples to the
# rest; paths connect replicates of the same pid x visit.
clinical_counts <-
  se_16S@colData |>
  as.data.frame() |>
  dplyr::filter(sample_type == "Clinical sample")

clinical_counts |>
  ggplot(aes(x = resequenced, y = total_counts, fill = resequenced)) +
  # suggested exclusion threshold (5000 reads)
  geom_hline(yintercept = 5000, col = "red", linetype = "dashed") +
  geom_violin(alpha = 0.3, col = "transparent", scale = "width") +
  ggbeeswarm::geom_quasirandom(alpha = 0.5, size = 0.1) +
  geom_path(aes(group = interaction(pid, visit_code)), alpha = 0.1) +
  scale_y_log10(n.breaks = 8) +
  guides(fill = "none") +
  labs(x = "Re-sequenced", y = "Total counts (log scale)") +
  ggtitle("Clinical samples only")

We recommend excluding samples with less than 5000 counts for downstream analyses.

Code
# Library sizes per visit, restricted to visits with more than 20 samples
# so that the violins are well populated.
tmp <-
  se_16S@colData |>
  as.data.frame() |>
  dplyr::filter(!is.na(visit_code)) |>
  group_by(visit_code) |>
  mutate(n = n()) |>
  ungroup() |>
  dplyr::filter(n > 20) |>
  mutate(visit_code = factor(visit_code)) |>
  arrange(pid, visit_code)

tmp |>
  ggplot(aes(y = total_counts, x = visit_code, fill = visit_code, col = visit_code)) +
  geom_violin(draw_quantiles = 0.5, alpha = 0.5) +
  scale_y_log10(n.breaks = 10) +
  guides(fill = "none", col = "none") +
  labs(y = "Total counts (log scale)", x = "Visit") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Controls

Amplification negative controls

Code
# Composition of amplification negative controls, restricted to taxa that
# reach at least 20% relative abundance in at least one such control.
amp_neg <-
  se_16S |>
  dplyr::filter(sample_type == "Amplification negative control") |>
  as_tibble() |>
  group_by(.feature) |>
  mutate(max_rel_ab = max(rel_ab)) |>
  ungroup() |>
  dplyr::filter(max_rel_ab > 0.2)

top_taxa <- unique(amp_neg$.feature)

amp_neg |>
  ggplot(aes(x = sample_id, y = rel_ab, fill = .feature)) +
  geom_col(col = "white", linewidth = 0.1) +
  scale_fill_manual(
    str_c("Top ", length(top_taxa), " taxa (based on max rel. ab.)"),
    breaks = top_taxa,
    values = get_taxa_colors(top_taxa)
  ) +
  facet_grid(. ~ control_type, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

Amplification positive controls

Code
# Composition of amplification positive controls, restricted to taxa that
# reach at least 10% relative abundance in at least one such control.
amp_pos <-
  se_16S |>
  dplyr::filter(sample_type == "Amplification positive control") |>
  as_tibble() |>
  group_by(.feature) |>
  mutate(max_rel_ab = max(rel_ab)) |>
  ungroup() |>
  dplyr::filter(max_rel_ab > 0.1)

top_taxa <- unique(amp_pos$.feature)

amp_pos |>
  ggplot(aes(x = sample_id, y = rel_ab, fill = .feature)) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", length(top_taxa), " taxa (based on max rel. ab.)"),
    breaks = top_taxa,
    values = get_taxa_colors(top_taxa)
  ) +
  facet_grid(. ~ control_type, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

There is one outlying amplification positive control — we should check with the labs whether there is an explanation for it. Since it is not associated with a plate number, it might be a test sample, in which case it is not a concern for the quality of the data.

Note

Anyone from the labs that can explain this outlier? And/or if it’s important?

Negative controls

Code
# Composition of (extraction) negative controls, restricted to taxa that
# reach at least 10% relative abundance in at least one such control.
neg_ctrl <-
  se_16S |>
  dplyr::filter(sample_type == "Negative control") |>
  as_tibble() |>
  group_by(.feature) |>
  mutate(max_rel_ab = max(rel_ab)) |>
  ungroup() |>
  dplyr::filter(max_rel_ab > 0.1)

top_taxa <- unique(neg_ctrl$.feature)

neg_ctrl |>
  ggplot(aes(x = sample_id, y = rel_ab, fill = .feature)) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", length(top_taxa), " taxa (based on max rel. ab.)"),
    breaks = top_taxa,
    values = get_taxa_colors(top_taxa)
  ) +
  facet_grid(. ~ ext_lib_plate_type + control_type, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

Positive controls

Code
# Composition of (extraction) positive controls; the lower threshold (1%)
# keeps more taxa so replicability across plates can be assessed.
pos_ctrl <-
  se_16S |>
  dplyr::filter(sample_type == "Positive control") |>
  as_tibble() |>
  group_by(.feature) |>
  mutate(max_rel_ab = max(rel_ab)) |>
  ungroup() |>
  dplyr::filter(max_rel_ab > 0.01)

top_taxa <- unique(pos_ctrl$.feature)

pos_ctrl |>
  ggplot(aes(x = sample_id, y = rel_ab, fill = .feature)) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", length(top_taxa), " taxa (based on max rel. ab.)"),
    breaks = top_taxa,
    values = get_taxa_colors(top_taxa)
  ) +
  facet_grid(. ~ control_type + ext_lib_plate_type, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

That gives us an idea of the replicability and expected variability in the 16S data.

Replicates

Code
# Compare replicates: clinical samples sequenced more than once for the
# same pid x visit, restricted to the first 20 features.
# NOTE(review): `.feature[1:20]` takes the first 20 feature entries in row
# order — this assumes features are ordered by overall abundance in the SE
# object; confirm upstream ordering.
tmp <- 
  se_16S |> 
  as_tibble() |> 
  dplyr::filter(sample_type == "Clinical sample") |> 
  group_by(pid, visit_code) |> mutate(n = sample_id |> unique() |> length()) |> ungroup() |>
  dplyr::filter(n > 1) |> 
  dplyr::filter(.feature %in% .feature[1:20])
  # group_by(.feature) |>
  # mutate(max_rel_ab = max(rel_ab)) |> 
  # ungroup() |> 
  # dplyr::filter(max_rel_ab > 0.1)

# One bar per replicate (x = sequencing pool), one panel per pid x visit;
# bar transparency encodes the replicate's library size (log10 scale).
tmp |> 
  ggplot() +
  aes(x = pool |> parse_number() |> factor(), y = rel_ab, fill = .feature, alpha = total_counts |> log10()) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", tmp$.feature |> unique() |> length(), " taxa (based on max rel. ab.)"), 
    breaks = tmp$.feature |> unique(), 
    values = tmp$.feature |> unique() |> get_taxa_colors()
  ) +
  facet_wrap(. ~ pid + visit_code, scales = "free_x", nrow = 3) +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    ) +
  xlab("pool")

Relatively good agreement between the replicates; for the longitudinal profiles below, when we have replicates, we’ll use the replicate that had the most total reads.

Longitudinal profiles for randomized participants

Code
# Attach randomization status and study location from the visit table `v`.
# Any pre-existing columns with the same names are dropped first so the
# join cannot create duplicated `.x`/`.y` columns.
tmp <- 
  se_16S@colData |> 
  as.data.frame() |> 
  select(-any_of(c("randomized", "location"))) |> 
  dplyr::left_join(
    v |> select(pid, randomized, location) |> distinct(),
    by = join_by(pid),
    # each pid must map to a single (randomized, location) pair; fail
    # loudly here instead of silently fanning out the colData rows
    relationship = "many-to-one"
  )

# Row order is preserved by left_join, so direct column assignment is safe.
se_16S@colData$randomized <- tmp$randomized
se_16S@colData$location <- tmp$location
Code
# Number of taxa to display in the longitudinal composition plots.
n_taxa <- 20

# Long-format data for randomized participants, restricted to the first
# n_taxa features (in the SE object's row order).
tmp <-
  se_16S |>
  dplyr::filter(randomized) |>
  as_tibble() |>
  dplyr::filter(.feature %in% .feature[seq_len(n_taxa)])

# For each pid x visit, keep the replicate with the largest library size,
# then keep only visits represented by more than 20 participants.
selected_replicates <-
  tmp |>
  select(sample_id, pid, visit_code, pool, total_counts) |>
  distinct() |>
  arrange(pid, visit_code, desc(total_counts)) |>
  group_by(pid, visit_code) |>
  slice_head(n = 1) |>
  ungroup() |>
  group_by(visit_code) |>
  mutate(n = n()) |>
  ungroup() |>
  dplyr::filter(n > 20)
Code
# Composition per participant (one bar each), faceted by visit and location.
tmp |>
  dplyr::filter(sample_id %in% selected_replicates$sample_id) |>
  ggplot(aes(x = pid, y = rel_ab, fill = .feature)) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", n_taxa, " taxa"),
    breaks = unique(tmp$.feature),
    values = get_taxa_colors(unique(tmp$.feature))
  ) +
  facet_grid(visit_code ~ location, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

Code
# Longitudinal composition per participant: one row of panels per pid,
# nested under location, with visits along the x axis.
tmp |>
  dplyr::filter(sample_id %in% selected_replicates$sample_id) |>
  ggplot(aes(x = visit_code, y = rel_ab, fill = .feature)) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", n_taxa, " taxa"),
    breaks = unique(tmp$.feature),
    values = get_taxa_colors(unique(tmp$.feature))
  ) +
  ggh4x::facet_nested(location + pid ~ ., scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

Save SummarizedExperiment objects

We save two SE objects to disk:

  • se_16S_raw: one that has all the controls and replicates (current se_16S object)

  • se_16S_agg: one that only has the clinical samples (if replicates, we keep the one with the most total counts) and the positive and negative controls (not the amplification controls).

For se_16S_agg, we add the uid column (the VIBRANT cross-assay unique identifier) to the colData and as the SE “sample id” so that we can merge with the other assays.

Code
# We only select the clinical samples and the positive and negative controls
# (the amplification controls are dropped from the aggregated object)
se_16S_agg <- 
  se_16S[, se_16S$sample_type %in% c("Clinical sample", "Positive control", "Negative control")]

# we pick the samples with the most total counts when there are replicates:
# sort each pid x visit_code group by decreasing total_counts and keep the
# first row; all non-clinical samples (controls) are kept via bind_rows
selected_samples <- 
  se_16S_agg@colData |>
  as.data.frame() |> 
  as_tibble() |>
  dplyr::filter(sample_type == "Clinical sample") |>
  select(sample_id, pid, visit_code, pool, total_counts) |> 
  distinct() |> 
  arrange(pid, visit_code, -total_counts) |> 
  group_by(pid, visit_code) |>
  slice_head(n = 1) |> 
  ungroup() |> 
  select(sample_id) |> 
  bind_rows(
    se_16S_agg@colData |>
      as.data.frame() |> 
      as_tibble() |>
      dplyr::filter(sample_type != "Clinical sample") |> 
      select(sample_id)
  ) |> 
  distinct()
  
se_16S_agg <- se_16S_agg[, selected_samples$sample_id]

# We create the VIBRANT uid (cross-assay unique identifier):
# "<pid>_<visit_code>" for clinical samples, and the raw sample_id for
# samples without a pid (controls)
se_16S_agg@colData$uid <- 
  ifelse(
    !is.na(se_16S_agg@colData$pid), 
    str_c(
      se_16S_agg@colData$pid, "_",
      se_16S_agg@colData$visit_code
    ), 
    se_16S_agg@colData$sample_id
  )

# we use these uids as the sample IDs: rebuild the SE so that the assay
# column names and the colData row names both carry the uid, which is what
# downstream MAE integration matches on
se_16S_agg <- 
  SummarizedExperiment(
    assays = list(
      counts = assay(se_16S_agg, "counts") |> set_colnames(se_16S_agg$uid),
      rel_ab = assay(se_16S_agg, "rel_ab") |> set_colnames(se_16S_agg$uid)
    ),
    rowData = rowData(se_16S_agg),
    colData = se_16S_agg@colData |> set_rownames(se_16S_agg$uid),
    metadata = se_16S_agg@metadata
  )
Code
se_16S_agg
# A SummarizedExperiment-tibble abstraction: 3,365,223 × 56
# Features=817 | Samples=4119 | Assays=counts, rel_ab
   .feature         .sample counts rel_ab sample_id sample_id_barcode patient_id
   <chr>            <chr>    <dbl>  <dbl> <chr>     <chr>             <chr>     
 1 Lactobacillus_i… 068100…   6904 0.0933 202_068_… 202               068_10_00…
 2 Lactobacillus_c… 068100…      0 0      202_068_… 202               068_10_00…
 3 Gardnerella_vag… 068100…   8284 0.112  202_068_… 202               068_10_00…
 4 Ca_Lachnocurva_… 068100…      0 0      202_068_… 202               068_10_00…
 5 Sneathia_vagina… 068100…  21143 0.286  202_068_… 202               068_10_00…
 6 Fannyhessea_vag… 068100…   1601 0.0216 202_068_… 202               068_10_00…
 7 Sneathia_sangui… 068100…      0 0      202_068_… 202               068_10_00…
 8 Prevotella_bivia 068100…   1963 0.0265 202_068_… 202               068_10_00…
 9 Prevotella_amnii 068100…      0 0      202_068_… 202               068_10_00…
10 f_Dethiosulfovi… 068100…   3629 0.0490 202_068_… 202               068_10_00…
# ℹ 40 more rows
# ℹ 49 more variables: visit_code_ps <chr>, idt_adapter_name <chr>,
#   i5_index_forward <chr>, i5_index_reverse <chr>, i7_index_reverse <chr>,
#   i7_index <chr>, sample_id_2 <chr>,
#   spike_in_allobacillus_halotolerans_rel <dbl>,
#   spike_in_imtechella_halotolerans_rel <dbl>,
#   spike_in_allobacillus_halotolerans_counts <dbl>, …

Before saving the se_16S_agg object, we check that it is compatible with the requirements for integration into the MAE.

Code
# we remove the identifier columns pid, visit_code, site, and location
# (note: "site" is removed here as well, in addition to the three columns
# originally listed)
colData(se_16S_agg) <- colData(se_16S_agg)[, !colnames(colData(se_16S_agg)) %in% c("pid", "visit_code", "site", "location")]
# we also remove additional columns that are not needed for downstream analyses
unneeded_cols <- 
  c(
    "randomized", 
    "spike_in_allobacillus_halotolerans_rel", "spike_in_imtechella_halotolerans_rel",
    "spike_in_allobacillus_halotolerans_counts", "spike_in_imtechella_halotolerans_counts"
    )
colData(se_16S_agg) <- colData(se_16S_agg)[, !colnames(colData(se_16S_agg)) %in% unneeded_cols]

# flag samples recommended for exclusion (library size below 5000 reads)
# NOTE(review): ifelse() yields NA when total_counts is NA — confirm every
# sample has a non-missing total_counts before relying on this flag
se_16S_agg$exclude_sample <- ifelse(se_16S_agg$total_counts < 5000, "Yes, low counts", "No")
# put the key QC columns first; unique() keeps the remaining columns in
# their original order without duplicating the ones listed explicitly
ordered_cols <- unique(c("uid", "exclude_sample", "total_counts", "sample_type", "control_type", colnames(colData(se_16S_agg))))
colData(se_16S_agg) <- colData(se_16S_agg)[,ordered_cols]

# validate against the project-wide SE requirements (check_se() comes from
# the sourced utility scripts)
se_16S_agg <- check_se(se_16S_agg)
Code
# Save both SE objects to disk, stamped with today's date (YYYYMMDD).
# The stamp is computed once and reused so both filenames are guaranteed
# to agree (and format() is the idiomatic Date -> string conversion,
# rather than removing dashes from the printed form).
date_stamp <- format(today(), "%Y%m%d")

# Raw object: all controls and replicates.
saveRDS(
  se_16S, 
  str_c(get_01_output_dir(), "04_se_16S_raw_", date_stamp, ".rds")
)

# Aggregated object: clinical samples (best replicate) plus positive and
# negative controls, keyed by uid.
saveRDS(
  se_16S_agg, 
  str_c(get_01_output_dir(), "04_se_16S_agg_", date_stamp, ".rds")
)